Skip to content

Conversation

akadutta
Copy link
Contributor

This is an extension of #157968. It adds support for V_PK_MOV_B32, removes a few redundant checks, and reduces MachineFunction scans for architectures that do not support this optimization.

@llvmbot
Copy link
Member

llvmbot commented Oct 14, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Akash Dutta (akadutta)

Changes

This is an extension of #157968. It adds support for V_PK_MOV_B32, removes a few redundant checks, and reduces MachineFunction scans for architectures that do not support this optimization.


Patch is 78.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/163464.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp (+110-75)
  • (modified) llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir (+589-12)
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index c684f9e3a6e2c..379dc4f282743 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -47,9 +47,6 @@ class SIPreEmitPeephole {
                              const MachineBasicBlock &From,
                              const MachineBasicBlock &To) const;
   bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
-  // Check if the machine instruction being processed is a supported packed
-  // instruction.
-  bool isUnpackingSupportedInstr(MachineInstr &MI) const;
   // Creates a list of packed instructions following an MFMA that are suitable
   // for unpacking.
   void collectUnpackingCandidates(MachineInstr &BeginMI,
@@ -62,7 +59,7 @@ class SIPreEmitPeephole {
   // v_fma_f32 v1, v0, v2, v2
   // Here, we have overwritten v0 before we use it. This function checks if
   // unpacking can lead to such a situation.
-  bool canUnpackingClobberRegister(const MachineInstr &MI);
+  bool canUnpackingClobberRegister(MachineInstr &MI);
   // Unpack and insert F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and
   // V_PK_FMA. Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for
   // this transformation.
@@ -456,22 +453,8 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
 
 // If support is extended to new operations, add tests in
 // llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
-bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
-  if (!TII->isNeverCoissue(MI))
-    return false;
-  unsigned Opcode = MI.getOpcode();
-  switch (Opcode) {
-  case AMDGPU::V_PK_ADD_F32:
-  case AMDGPU::V_PK_MUL_F32:
-  case AMDGPU::V_PK_FMA_F32:
-    return true;
-  default:
-    return false;
-  }
-  llvm_unreachable("Fully covered switch");
-}
 
-bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
+bool SIPreEmitPeephole::canUnpackingClobberRegister(MachineInstr &MI) {
   unsigned OpCode = MI.getOpcode();
   Register DstReg = MI.getOperand(0).getReg();
   // Only the first register in the register pair needs to be checked due to the
@@ -482,21 +465,9 @@ bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
   // Such scenarios can arise due to specific combinations of op_sel and
   // op_sel_hi modifiers.
   Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);
-
-  const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
-  if (Src0MO && Src0MO->isReg()) {
-    Register SrcReg0 = Src0MO->getReg();
-    unsigned Src0Mods =
-        TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
-    Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
-                             ? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
-                             : TRI->getSubReg(SrcReg0, AMDGPU::sub0);
-    // Check if the register selected by op_sel_hi is the same as the first
-    // register in the destination register pair.
-    if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
-      return true;
-  }
-
+  uint16_t UnpackedOpCode = mapToUnpackedOpcode(MI);
+  bool UnpackedInstHasOneSrcOp =
+      !AMDGPU::hasNamedOperand(UnpackedOpCode, AMDGPU::OpName::src1);
   const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
   if (Src1MO && Src1MO->isReg()) {
     Register SrcReg1 = Src1MO->getReg();
@@ -505,25 +476,44 @@ bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
     Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
                              ? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
                              : TRI->getSubReg(SrcReg1, AMDGPU::sub0);
+    // Check if the register selected by op_sel_hi is the same as the first
+    // register in the destination register pair.
     if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
       return true;
   }
 
-  // Applicable for packed instructions with 3 source operands, such as
-  // V_PK_FMA.
-  if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
-    const MachineOperand *Src2MO =
-        TII->getNamedOperand(MI, AMDGPU::OpName::src2);
-    if (Src2MO && Src2MO->isReg()) {
-      Register SrcReg2 = Src2MO->getReg();
-      unsigned Src2Mods =
-          TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
-      Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
-                               ? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
-                               : TRI->getSubReg(SrcReg2, AMDGPU::sub0);
-      if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
+  // V_MOV_B32s have one src operand. Other candidate unpacked instructions with
+  // 2 or more src operands will perform the following checks.
+  if (!UnpackedInstHasOneSrcOp) {
+    const MachineOperand *Src0MO =
+        TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+    if (Src0MO && Src0MO->isReg()) {
+      Register SrcReg0 = Src0MO->getReg();
+      unsigned Src0Mods =
+          TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
+      Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
+                               ? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
+                               : TRI->getSubReg(SrcReg0, AMDGPU::sub0);
+      if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
         return true;
     }
+
+    // Applicable for packed instructions with 3 source operands, such as
+    // V_PK_FMA.
+    if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
+      const MachineOperand *Src2MO =
+          TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+      if (Src2MO && Src2MO->isReg()) {
+        Register SrcReg2 = Src2MO->getReg();
+        unsigned Src2Mods =
+            TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
+        Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
+                                 ? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
+                                 : TRI->getSubReg(SrcReg2, AMDGPU::sub0);
+        if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
+          return true;
+      }
+    }
   }
   return false;
 }
@@ -540,6 +530,11 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
     return AMDGPU::V_MUL_F32_e64;
   case AMDGPU::V_PK_FMA_F32:
     return AMDGPU::V_FMA_F32_e64;
+  case AMDGPU::V_PK_MOV_B32:
+    // Source modifiers aren't handled for MOV due to prevailing restrictions.
+    // Hence, 64-bit encoding isn't necessary. Will create unnecessary
+    // instruction cache pressure.
+    return AMDGPU::V_MOV_B32_e32;
   default:
     return std::numeric_limits<uint16_t>::max();
   }
@@ -549,6 +544,7 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
 void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
                                           unsigned SrcMods, bool IsHiBits,
                                           const MachineOperand &SrcMO) {
+  unsigned NewOpCode = NewMI->getOpcode();
   unsigned NewSrcMods = 0;
   unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
   unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
@@ -561,12 +557,18 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
   // modifier for the higher 32 bits. Unpacked VOP3 instructions support
   // ABS, but do not support NEG_HI. Therefore we need to explicitly add the
   // NEG modifier if present in the packed instruction.
+  bool IsSrcModifidiersSupported =
+      AMDGPU::hasNamedOperand(NewOpCode, AMDGPU::OpName::src0_modifiers);
+  bool UnpackedInstHasOneSrcOp =
+      !AMDGPU::hasNamedOperand(NewOpCode, AMDGPU::OpName::src1);
+
   if (SrcMods & NegModifier)
     NewSrcMods |= SISrcMods::NEG;
   // Src modifiers. Only negative modifiers are added if needed. Unpacked
   // operations do not have op_sel, therefore it must be handled explicitly as
   // done below.
-  NewMI.addImm(NewSrcMods);
+  if (IsSrcModifidiersSupported)
+    NewMI.addImm(NewSrcMods);
   if (SrcMO.isImm()) {
     NewMI.addImm(SrcMO.getImm());
     return;
@@ -594,7 +596,7 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
     bool OpSel = SrcMods & SISrcMods::OP_SEL_0;
     bool OpSelHi = SrcMods & SISrcMods::OP_SEL_1;
     bool KillState = true;
-    if ((OpSel == OpSelHi) && !IsHiBits)
+    if ((OpSel == OpSelHi) && !IsHiBits && !UnpackedInstHasOneSrcOp)
       KillState = false;
     UnpackedSrcMO.setIsKill(KillState);
   }
@@ -612,10 +614,12 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
 
   for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
     MachineInstr &Instr = *I;
+    uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
     if (Instr.isMetaInstruction())
       continue;
     if ((Instr.isTerminator()) ||
-        (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
+        (TII->isNeverCoissue(Instr) &&
+         (UnpackedOpCode == std::numeric_limits<uint16_t>::max())) ||
         (SIInstrInfo::modifiesModeRegister(Instr) &&
          Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
       return;
@@ -639,7 +643,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
       if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
         return;
     }
-    if (!isUnpackingSupportedInstr(Instr))
+    if (UnpackedOpCode == std::numeric_limits<uint16_t>::max())
       continue;
 
     if (canUnpackingClobberRegister(Instr))
@@ -661,8 +665,28 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
   MachineOperand DstOp = I.getOperand(0);
 
   uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
-  assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
-         "Unsupported Opcode");
+  // V_MOV_B32 does not support source modifiers. Without source modifiers, we
+  // cannot be faithful to the packed instruction semantics in few cases. This
+  // is true when the packed instruction has NEG and NEG_HI modifiers. We should
+  // abort unpacking if:
+  // 1. hi/lo bits selected by OPSEL for src0 are also marked by NEG or NEG_HI.
+  // 2. hi/lo bits selected by OPSEL_HI for src1 are also marked by NEG or
+  // NEG_HI.
+  // Packed instructions do not specify ABS modifiers, so we can safely ignore
+  // those.
+  if (!AMDGPU::hasNamedOperand(UnpackedOpcode,
+                               AMDGPU::OpName::src0_modifiers)) {
+    unsigned Src0Mods =
+        TII->getNamedOperand(I, AMDGPU::OpName::src0_modifiers)->getImm();
+    unsigned Src1Mods =
+        TII->getNamedOperand(I, AMDGPU::OpName::src1_modifiers)->getImm();
+    unsigned negMask0 =
+        (Src0Mods & SISrcMods::OP_SEL_0) ? SISrcMods::NEG_HI : SISrcMods::NEG;
+    unsigned negMask1 =
+        (Src1Mods & SISrcMods::OP_SEL_1) ? SISrcMods::NEG_HI : SISrcMods::NEG;
+    if ((Src0Mods & negMask0) || (Src1Mods & negMask1))
+      return;
+  }
 
   MachineInstrBuilder Op0LOp1L =
       createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false);
@@ -689,8 +713,8 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
                                                         bool IsHiBits) {
   MachineBasicBlock &MBB = *I.getParent();
   const DebugLoc &DL = I.getDebugLoc();
-  const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
-  const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
+  const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
+  const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
   Register DstReg = I.getOperand(0).getReg();
   unsigned OpCode = I.getOpcode();
   Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
@@ -704,8 +728,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
 
   MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
   NewMI.addDef(UnpackedDstReg); // vdst
-  addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1);
-  addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2);
+  if (AMDGPU::hasNamedOperand(UnpackedOpcode, AMDGPU::OpName::src0) &&
+      AMDGPU::hasNamedOperand(UnpackedOpcode, AMDGPU::OpName::src1)) {
+    addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0);
+    addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1);
+  } else {
+    const MachineOperand *SrcMO = IsHiBits ? SrcMO1 : SrcMO0;
+    unsigned SrcMods = IsHiBits ? Src1Mods : Src0Mods;
+    addOperandAndMods(NewMI, SrcMods, IsHiBits, *SrcMO);
+  }
 
   if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
     const MachineOperand *SrcMO3 =
@@ -714,10 +745,12 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
         TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
     addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3);
   }
-  NewMI.addImm(ClampVal); // clamp
+  if (AMDGPU::hasNamedOperand(UnpackedOpcode, AMDGPU::OpName::clamp))
+    NewMI.addImm(ClampVal); // clamp
   // Packed instructions do not support output modifiers. safe to assign them 0
   // for this use case
-  NewMI.addImm(0); // omod
+  if (AMDGPU::hasNamedOperand(UnpackedOpcode, AMDGPU::OpName::omod))
+    NewMI.addImm(0); // omod
   return NewMI;
 }
 
@@ -789,22 +822,24 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
 
   // TODO: Fold this into previous block, if possible. Evaluate and handle any
   // side effects.
-  for (MachineBasicBlock &MBB : MF) {
-    // Unpack packed instructions overlapped by MFMAs. This allows the compiler
-    // to co-issue unpacked instructions with MFMA
-    auto SchedModel = TII->getSchedModel();
-    SetVector<MachineInstr *> InstrsToUnpack;
-    for (auto &MI : make_early_inc_range(MBB.instrs())) {
-      if (!SIInstrInfo::isMFMA(MI))
-        continue;
-      const MCSchedClassDesc *SchedClassDesc =
-          SchedModel.resolveSchedClass(&MI);
-      uint16_t NumMFMACycles =
-          SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
-      collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
-    }
-    for (MachineInstr *MI : InstrsToUnpack) {
-      performF32Unpacking(*MI);
+  if (ST.hasGFX950Insts() || ST.hasGFX940Insts()) {
+    for (MachineBasicBlock &MBB : MF) {
+      // Unpack packed instructions overlapped by MFMAs. This allows the
+      // compiler to co-issue unpacked instructions with MFMA
+      auto SchedModel = TII->getSchedModel();
+      SetVector<MachineInstr *> InstrsToUnpack;
+      for (auto &MI : make_early_inc_range(MBB.instrs())) {
+        if (!SIInstrInfo::isMFMA(MI))
+          continue;
+        const MCSchedClassDesc *SchedClassDesc =
+            SchedModel.resolveSchedClass(&MI);
+        uint16_t NumMFMACycles =
+            SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
+        collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
+      }
+      for (MachineInstr *MI : InstrsToUnpack) {
+        performF32Unpacking(*MI);
+      }
     }
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
index 75ae76fdee19b..2f7c42f46e0c2 100644
--- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
@@ -641,9 +641,6 @@ body:             |
     ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
     ; GFX950-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec
     ; GFX950-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec
-    ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
-    ; GFX950-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
-    ; GFX950-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec
     ; GFX950-NEXT: S_ENDPGM 0
     ;
     ; GFX942-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped
@@ -664,9 +661,6 @@ body:             |
     ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
     ; GFX942-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec
     ; GFX942-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec
-    ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
-    ; GFX942-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
-    ; GFX942-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec
     ; GFX942-NEXT: S_ENDPGM 0
     ;
     ; GFX90A-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped
@@ -687,9 +681,6 @@ body:             |
     ; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
     ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec
     ; GFX90A-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec
-    ; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
-    ; GFX90A-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
-    ; GFX90A-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec
     ; GFX90A-NEXT: S_ENDPGM 0
     early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
     renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
@@ -706,9 +697,6 @@ body:             |
     $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
     $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec
     $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec
-    $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
-    $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
-    $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -1167,3 +1155,592 @@ body:             |
     $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
     renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 11, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
     S_ENDPGM 0
+
+...
+---
+name:            test_v_pk_mov_unpacking
+tracksRegLiveness: true
+
+liveins:
+  - { reg: '$sgpr4_sgpr5' }
+
+body:             |
+  bb.0.entry:
+  liveins: $sgpr4_sgpr5
+    ; GFX950-LABEL: name: test_v_pk_mov_unpacking
+    ; GFX950: liveins: $sgpr4_sgpr5
+    ; GFX950-NEXT: {{  $}}
+    ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0
+    ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0
+    ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0
+    ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0
+    ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0
+    ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47
+    ; GFX950-NEXT: $vgpr2_vgpr3 ...
[truncated]


// If support is extended to new operations, add tests in
// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
Copy link
Contributor

@jrbyrnes jrbyrnes Oct 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you handle the removal of this function in a separate NFC PR? I need it for something related, and I don't think that removal should be blocked by adding V_PK_MOV support.

; GFX90A-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
; GFX90A-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec
; GFX90A-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec
; GFX90A-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Deleting unrelated instructions from existing test check lines is a suspicious test change — please confirm this is intentional rather than a regenerated-check artifact.

Comment on lines +1163 to +1166

liveins:
- { reg: '$sgpr4_sgpr5' }

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
liveins:
- { reg: '$sgpr4_sgpr5' }

- { reg: '$sgpr4_sgpr5' }

body: |
bb.0.entry:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
bb.0.entry:
bb.0:

Comment on lines +1414 to +1417

liveins:
- { reg: '$sgpr4_sgpr5' }

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't need function liveins

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants